/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.fetcher;
import net.nutch.net.protocols.Response;
import net.nutch.pagedb.FetchListEntry;
import net.nutch.net.protocols.http.*;
import net.nutch.net.protocols.ftp.*;
import net.nutch.io.*;
import net.nutch.db.*;
import net.nutch.util.*;
import net.nutch.util.RobotsMetaProcessor.*;
import java.io.*;
import java.net.*;
import java.text.*;
import java.util.*;
import java.util.logging.*;
import org.cyberneko.html.parsers.*;
import org.xml.sax.*;
import org.w3c.dom.*;
import org.w3c.dom.html.*;
import org.apache.html.dom.*;
/***************************************
* A simple Fetcher, now adorned with new features,
* such as hostname bans and politeness constraints.
*
* @author Doug Cutting, added to by Mike Cafarella
***************************************/
public class Fetcher {
//
// Time until a cached robots.txt rule set expires: 1 day.
//
static final long DEFAULT_ROBOTS_LIFETIME = 1 * 24 * 60 * 60 * 1000;
//
// Floor for the robots.txt cache lifetime, applied even when a server's
// Expires header asks for less: 5 minutes.
//
static final long MINIMUM_ROBOTS_LIFETIME = 5 * 60 * 1000;
// Politeness delay between hits on the same host, in milliseconds.
// Configured in seconds via "fetcher.server.delay" (default 1 second).
private long serverDelay =
NutchConf.getInt("fetcher.server.delay", 1) * 1000;
// Agent name we advertise to servers and match against robots.txt rules.
static final String AGENT_NAME = NutchConf.get("http.agent.name");
public static final Logger LOG =
LogFormatter.getLogger("net.nutch.fetcher.Fetcher");
private ArrayFile.Reader fetchList; // the input
private ArrayFile.Writer fetcherDb; // the output (fetch status + outlinks)
private ArrayFile.Writer rawDb; // raw page content output
private ArrayFile.Writer strippedDb; // extracted-text output
// Hostname suffix matchers loaded from the excludehosts file;
// null when no bans are configured.
private TrieStringMatcher hostnameBans[];
private int threadCount = // max number of threads
NutchConf.getInt("fetcher.threads.fetch", 10);
private long start; // start time of fetcher run
private long bytes; // total bytes fetched
private int pages; // total pages fetched
private int errors; // total pages errored
private ThreadGroup group = new ThreadGroup("fetcher"); // our thread group
// Socket timeout in milliseconds; -1 means "not set".
private int timeout = -1;
private Http http = new Http(); // shared HTTP client (Ftp is per-thread)
private RobotRulesParser robotRulesParser;
// hostname -> RobotRulesParser.RobotRuleSet; Hashtable so that the
// concurrent get/put/remove calls from FetcherThreads are synchronized.
private Hashtable robotRulesCache = new Hashtable();
// Hosts whose robots.txt fetch failed at the connection level.
// NOTE(review): read and written by many FetcherThreads without
// synchronization, and TreeSet is not thread-safe -- confirm intent.
private TreeSet deadHosts = new TreeSet();
/*********************************************
 * A (hostname, readyTime) pair.  The host may not be contacted again
 * until readyTime, which is fixed at construction time as
 * "now + serverDelay".  Instances are ordered by blockedHostComparator
 * inside the blockedHostsByOrder set.
 ********************************************/
class BlockedHost {
    String hostname;   // host currently serving out its delay
    long readyTime;    // earliest time (ms since epoch) the host may be hit

    public BlockedHost(String hostname) {
        long now = System.currentTimeMillis();
        this.hostname = hostname;
        this.readyTime = now + serverDelay;
    }

    public String getHostname() { return hostname; }

    public long getReadyTime() { return readyTime; }
}
// Orders BlockedHosts by ready time, breaking ties by hostname so that
// distinct hosts are never considered equal (TreeSet would otherwise
// drop one of them).
Comparator blockedHostComparator = new Comparator() {
    public int compare(Object o1, Object o2) {
        BlockedHost bh1 = (BlockedHost) o1;
        BlockedHost bh2 = (BlockedHost) o2;
        // FIX: the old code cast the long difference straight to int,
        // which can overflow and flip the sign for large time gaps,
        // corrupting the TreeSet ordering.  Compare explicitly instead.
        long diff = bh1.getReadyTime() - bh2.getReadyTime();
        if (diff == 0) {
            return bh1.getHostname().compareTo(bh2.getHostname());
        } else {
            return (diff < 0) ? -1 : 1;
        }
    }
};
// Hosts currently serving out their politeness delay, ordered by the
// time they become ready again (then by name).  Guarded by
// synchronized(blockedHostsByOrder) in FetcherThread.run().
TreeSet blockedHostsByOrder = new TreeSet(blockedHostComparator);
// Hostnames of the same blocked hosts, for quick membership tests.
TreeSet blockedHostsByName = new TreeSet();
// hostname -> LinkedList of FetchListEntries waiting for that host's
// delay to expire.  Guarded by synchronized(blockedPendingQueues).
TreeMap blockedPendingQueues = new TreeMap();
// hostname -> LinkedList of entries whose host is ready to be hit again.
TreeMap readyPendingQueues = new TreeMap();
/********************************************
 * Fetcher thread: each instance repeatedly pulls the next eligible
 * FetchListEntry, screens it (bans, dead hosts, politeness delay,
 * robots.txt), fetches it, and records the result.
 ********************************************/
private class FetcherThread extends Thread {
// HTML parser and robots-META scratch object; one per thread since
// neither is shared safely across threads.
private DOMFragmentParser parser = new DOMFragmentParser();
private RobotsMetaIndicator robotsMeta = new RobotsMetaIndicator();
private Ftp ftp = null; // one instance per thread
// Socket timeout in ms; -1 means unset.  NOTE(review): this shadows
// Fetcher.timeout; the value is copied in via setTimeout() before start().
private int timeout = -1;
/**
 * Creates the thread in the shared "fetcher" thread group with a
 * placeholder name; the name is set to the URL being fetched later.
 */
public FetcherThread() {
super(group, "starting");
}
/**
 * Sets the socket timeout (ms) for this thread's HTTP and FTP
 * requests.  Must be called before the thread is started.
 */
public void setTimeout(int timeout) {
this.timeout = timeout;
}
/**
 * Main loop of a fetcher thread.  Each iteration:
 * <ul>
 * <li>releases hosts whose politeness delay has expired, moving their
 *     pending queues from "blocked" to "ready";</li>
 * <li>takes the next FetchListEntry from a ready queue, or failing
 *     that from the on-disk fetchlist;</li>
 * <li>screens it (hostname bans, fetch flag, dead hosts, per-host
 *     delay, robots.txt);</li>
 * <li>fetches the page and records the result.</li>
 * </ul>
 * The loop exits when the fetchlist is exhausted and no work remains
 * queued, or when a SEVERE error has been logged by any thread.
 */
public void run() {
    this.ftp = new Ftp();                        // one Ftp client per thread
    if (this.timeout != -1)
        this.ftp.setTimeout(this.timeout);
    boolean hasDiskItems = true;                 // false once fetchList is drained
    while (true) {
        if (LogFormatter.hasLoggedSevere())
            break;                               // global abort signal
        FetchListEntry fle = null;
        String urlString = null;
        try {
            setName("starting");
            //
            // Unblock any hosts that are past their delay time.
            //
            synchronized (blockedHostsByOrder) {
                while ((blockedHostsByOrder.size() > 0) &&
                       (((BlockedHost) blockedHostsByOrder.first()).getReadyTime() < System.currentTimeMillis())) {
                    BlockedHost blockedHost = (BlockedHost) blockedHostsByOrder.first();
                    blockedHostsByOrder.remove(blockedHost);
                    blockedHostsByName.remove(blockedHost.getHostname());
                    // The host is newly unblocked: move its pending
                    // queue from blocked to ready.
                    synchronized (blockedPendingQueues) {
                        LinkedList readyQueue = (LinkedList) blockedPendingQueues.get(blockedHost.getHostname());
                        if (readyQueue != null) {
                            blockedPendingQueues.remove(blockedHost.getHostname());
                            readyPendingQueues.put(blockedHost.getHostname(), readyQueue);
                        }
                    }
                }
            }
            //
            // Grab the next item: first check the ready pending queues...
            //
            synchronized (blockedPendingQueues) {
                while (fle == null && readyPendingQueues.size() > 0) {
                    String readyHost = (String) readyPendingQueues.firstKey();
                    LinkedList readyQueue = (LinkedList) readyPendingQueues.get(readyHost);
                    if (readyQueue.size() > 0) {
                        fle = (FetchListEntry) readyQueue.removeFirst();
                    }
                    if (readyQueue.size() == 0) {
                        readyPendingQueues.remove(readyHost);
                    }
                }
            }
            //
            // ...then fall back to the on-disk fetchlist.
            //
            if (fle == null && hasDiskItems) {
                fle = (FetchListEntry) fetchList.next(new FetchListEntry());
                if (fle == null) {
                    hasDiskItems = false;
                }
            }
            //
            // Still nothing: if hosts are blocked, sleep until the
            // earliest one is ready and retry; otherwise we are done.
            //
            if (fle == null) {
                boolean waitAndContinue = false;
                long targetTime = 0;
                synchronized (blockedHostsByOrder) {
                    if (blockedHostsByOrder.size() > 0) {
                        waitAndContinue = true;
                        targetTime = ((BlockedHost) blockedHostsByOrder.first()).getReadyTime();
                    }
                }
                if (waitAndContinue) {
                    long waitTime = targetTime - System.currentTimeMillis();
                    if (waitTime > 0) {
                        try {
                            Thread.sleep(waitTime);
                        } catch (InterruptedException ie) {
                            // best effort wait; re-check the queues anyway
                        }
                    }
                    continue;
                } else {
                    break;
                }
            }
            urlString = fle.getPage().getURL().toString();
            URL url = new URL(urlString);
            //
            // 1. Check hostname against the ban list.
            //
            String hostname = url.getHost().toLowerCase();
            boolean banned = false;
            if (hostnameBans != null) {
                for (int i = 0; i < hostnameBans.length && !banned; i++) {
                    if (hostnameBans[i].matches(hostname)) {
                        LOG.fine("Hostname banned for " + urlString);
                        handleNoFetch(fle, FetcherOutput.NOT_FOUND);
                        banned = true;
                    }
                }
            }
            if (banned) {
                // FIX: the old code used a plain 'continue' inside the
                // for loop above, which only advanced to the next ban
                // pattern -- banned URLs fell through and were fetched.
                continue;
            }
            //
            // 2. Check FLE whether we should fetch at all.
            //
            if (!fle.getFetch()) {
                LOG.fine("not fetching " + urlString);
                handleNoFetch(fle, FetcherOutput.SUCCESS);
                continue;
            }
            //
            // 3. Check whether the host is known to be dead.
            //
            if (deadHosts.contains(hostname)) {
                LOG.fine("host dead for " + urlString);
                handleNoFetch(fle, FetcherOutput.RETRY);
                continue;
            }
            //
            // 4. Enforce the per-host politeness delay.  If the host is
            // currently blocked, queue the FLE for later; otherwise mark
            // the host blocked and process this FLE now.  (Small crawls
            // hit few hosts repeatedly, so this matters most for them.)
            //
            synchronized (blockedHostsByOrder) {
                synchronized (blockedPendingQueues) {
                    if (blockedHostsByName.contains(hostname)) {
                        LinkedList blockedQueue = (LinkedList) blockedPendingQueues.get(hostname);
                        blockedQueue.add(fle);
                        continue;
                    } else {
                        BlockedHost bh = new BlockedHost(hostname);
                        blockedHostsByName.add(bh.getHostname());
                        blockedHostsByOrder.add(bh);
                        // Reuse any queue still sitting in the ready map
                        // for this host.
                        LinkedList readyQueue = (LinkedList) readyPendingQueues.remove(hostname);
                        if (readyQueue == null) {
                            readyQueue = new LinkedList();
                        }
                        blockedPendingQueues.put(hostname, readyQueue);
                    }
                }
            }
            //
            // 5. Check robots, fetching and caching robots.txt if needed.
            //
            RobotRulesParser.RobotRuleSet robotRules = (RobotRulesParser.RobotRuleSet) robotRulesCache.get(hostname);
            if (robotRules == null || (System.currentTimeMillis() > robotRules.getExpireTime())) {
                if (robotRules != null) {
                    robotRulesCache.remove(hostname);   // expired entry
                }
                try {
                    // Obtain robots.txt from the server.
                    URL robotURL = new URL(url.getProtocol(), url.getHost(), url.getPort(), "/robots.txt");
                    Response robotResponse = null;
                    if ("http".equals(url.getProtocol())) {
                        if (this.timeout != -1)
                            http.setTimeout(this.timeout);
                        robotResponse = http.getResponse(robotURL);
                    } else if ("ftp".equals(url.getProtocol())) {
                        robotResponse = this.ftp.getResponse(robotURL);
                    }
                    if (robotResponse == null) {
                        // FIX: unsupported protocol.  Fall through with
                        // default (empty) rules instead of the old NPE
                        // that was silently swallowed by the catch below.
                    } else if (robotResponse.getCode() == 200) {
                        robotRules = robotRulesParser.parseRules(robotResponse.getContent());
                        // Honor the Expires header, clamped to at least
                        // MINIMUM_ROBOTS_LIFETIME from now.
                        long expireTime = System.currentTimeMillis() + DEFAULT_ROBOTS_LIFETIME;
                        String expireStr = robotResponse.getHeader("Expires");
                        if (expireStr != null) {
                            try {
                                Date date = DateFormat.getDateInstance(DateFormat.LONG).parse(expireStr);
                                expireTime = date.getTime();
                                long min = System.currentTimeMillis() + MINIMUM_ROBOTS_LIFETIME;
                                if (expireTime < min) {
                                    expireTime = min;
                                }
                            } catch (Exception e) {
                                // unparseable Expires header: keep default
                            }
                        }
                        robotRules.setExpireTime(expireTime);
                    } else if (robotResponse.getCode() >= 400) {
                        // robots.txt not available but the server is up:
                        // just use default robots.
                    } else {
                        // robots.txt can't be loaded because the server
                        // is not there: mark this host dead.
                        deadHosts.add(hostname);
                        handleNoFetch(fle, FetcherOutput.RETRY);
                        continue;
                    }
                } catch (Exception e) {
                    // Best effort: any robots failure falls back to the
                    // default (empty) rules created below.
                }
                // Cache the resulting rules, creating empty (allow-all)
                // rules when robots.txt could not be loaded.
                if (robotRules == null) {
                    robotRules = robotRulesParser.getEmptyRules();
                    robotRules.setExpireTime(System.currentTimeMillis() + DEFAULT_ROBOTS_LIFETIME);
                }
                robotRulesCache.put(hostname, robotRules);
            }
            //
            // We are guaranteed a valid robots rule set at this point.
            //
            String path = url.getFile();
            if ((path == null) || "".equals(path)) {
                path = "/";
            }
            if (!robotRules.isAllowed(path)) {
                handleNoFetch(fle, FetcherOutput.NOT_FOUND);
                // FIX: the old code fell through here and fetched the
                // URL even though robots.txt disallowed it.
                continue;
            }
            //
            // Passed all tests: fetch the page.
            //
            LOG.info("fetching " + url);
            setName(urlString);
            Response response = null;
            if ("http".equals(url.getProtocol())) {
                if (this.timeout != -1)
                    http.setTimeout(this.timeout);
                response = http.getResponse(url);
            } else if ("ftp".equals(url.getProtocol())) {
                response = this.ftp.getResponse(url);
            } else {
                // Unsupported protocol: handled as NOT_FOUND by the
                // generic Throwable handler below (previously this was
                // an implicit NullPointerException).
                throw new IOException("unknown protocol: " + url.getProtocol());
            }
            handleFetch(url, fle, response);
            //
            // Record the results.
            //
            synchronized (Fetcher.this) {
                pages++;
                bytes += response.getContent().length;
                if ((pages % 100) == 0) {        // show status every 100 pages
                    status();
                }
            }
        } catch (HttpError e) {
            logError(urlString, fle, e);
            // mostly 401's and 403's: page not found
            handleNoFetch(fle, FetcherOutput.NOT_FOUND);
        } catch (SocketException e) {
            logError(urlString, fle, e);
            // timeout, dns or connect error: retry
            handleNoFetch(fle, FetcherOutput.RETRY);
        } catch (Throwable t) {
            if (fle != null) {
                logError(urlString, fle, t);
                handleNoFetch(fle, FetcherOutput.NOT_FOUND);
            }
        }
    }
    this.ftp = null;     // release the per-thread ftp client
    System.gc();
    return;
}
/**
 * Logs a failed fetch and bumps the shared error counter.
 */
private void logError(String urlString, FetchListEntry fle, Throwable t) {
    String message = "fetch of " + urlString + " failed with: " + t;
    LOG.info(message);
    // The counter is shared by all FetcherThreads; guard it with the
    // enclosing Fetcher's monitor, like pages/bytes.
    synchronized (Fetcher.this) {
        errors++;
    }
}
/**
 * Processes a successful fetch: parses text/html or text/plain
 * content, honors robots META directives (noindex/nofollow/nocache),
 * extracts title, text and outlinks, and appends the results to the
 * output databases with SUCCESS status.
 *
 * @param url      the URL that was fetched
 * @param fle      the fetchlist entry that produced the fetch
 * @param response the protocol response (content + headers)
 * @throws IOException  if the Content-Type is neither text/html nor
 *                      exactly "text/plain" (note: "text/plain;
 *                      charset=..." lands here too -- TODO confirm)
 * @throws SAXException if HTML parsing fails
 */
private void handleFetch(URL url, FetchListEntry fle,
Response response)
throws IOException, SAXException {
String contentType = response.getHeader("Content-Type");
String text;
String title;
Outlink[] outlinks;
byte[] content;
// A missing Content-Type header is treated as HTML.
if (contentType == null || contentType.startsWith("text/html")) {
DocumentFragment node = // parse content
new HTMLDocumentImpl().createDocumentFragment();
parser.parse(new InputSource
(new ByteArrayInputStream(response.getContent())),
node);
// Extract <META name="robots"> directives from the parsed DOM.
RobotsMetaProcessor.
getRobotsMetaDirectives(robotsMeta, node, url);
if (robotsMeta.getNoIndex()) {
text = ""; // ignore text and title
title = "";
} else { // extract text and title
StringBuffer sb = new StringBuffer();
DOMContentUtils.getText(sb, node);
text = sb.toString();
sb.setLength(0);
DOMContentUtils.getTitle(sb, node);
title = sb.toString().trim();
}
if (robotsMeta.getNoFollow()) { // ignore outlinks
outlinks = new Outlink[] {};
} else { // extract outlinks
// Resolve relative links against the response's URL.
URL baseURL = response.getUrl();
ArrayList l = new ArrayList();
DOMContentUtils.getOutlinks(baseURL, l, node);
outlinks = (Outlink[])l.toArray(new Outlink[l.size()]);
LOG.fine("found " + outlinks.length + " outlinks in " + url);
}
if (robotsMeta.getNoCache()) {
content= new byte[0]; // ignore content
} else {
content = response.getContent(); // cache content
}
} else if (contentType.equals("text/plain")) {
// NOTE(review): new String(byte[]) uses the platform default
// charset; presumably the response charset should be honored --
// confirm.
text = new String(response.getContent());
title = "";
outlinks = new Outlink[] {};
content = response.getContent();
} else {
throw new IOException("Unknown content-type: " + contentType);
}
// The MD5 of the raw content identifies the page downstream.
outputPage(new FetcherOutput(fle, MD5Hash.digest(response.getContent()),
FetcherOutput.SUCCESS,
title, outlinks),
new FetcherContent(content),
new FetcherText(text));
}
/**
 * Records an entry that was screened out or failed: writes a
 * FetcherOutput with the given status (keyed by the MD5 of the URL
 * string) plus empty content and text.
 */
private void handleNoFetch(FetchListEntry fle, int status) {
    String pageUrl = fle.getPage().getURL().toString();
    FetcherOutput fo =
        new FetcherOutput(fle, MD5Hash.digest(pageUrl), status, "", new Outlink[0]);
    outputPage(fo, new FetcherContent(new byte[0]), new FetcherText(""));
}
}
/**
 * Appends one page's results to the three output databases, atomically
 * with respect to the other fetcher threads.  A write failure is
 * logged as SEVERE, which causes all threads to shut down.
 */
private void outputPage(FetcherOutput fo, FetcherContent raw,
                        FetcherText stripped) {
    try {
        // One lock covers all three writers so the files stay in step.
        synchronized (fetcherDb) {
            fetcherDb.append(fo);
            rawDb.append(raw);
            strippedDb.append(stripped);
        }
    } catch (Throwable t) {
        String message = "error writing output:" + t.toString();
        LOG.severe(message);
    }
}
/**
 * Constructs a fetcher reading its fetchlist from, and writing its
 * output databases into, the given segment directory.
 *
 * @param directory segment directory containing the fetchlist
 * @throws IOException if any of the databases cannot be opened
 */
public Fetcher(String directory) throws IOException {
    //
    // Set up in/out streams.
    //
    fetchList = new ArrayFile.Reader
        (new File(directory, FetchListEntry.DIR_NAME).toString());
    fetcherDb = new ArrayFile.Writer
        (new File(directory, FetcherOutput.DIR_NAME).toString(), FetcherOutput.class);
    rawDb = new ArrayFile.Writer
        (new File(directory, FetcherContent.DIR_NAME).toString(), FetcherContent.class);
    strippedDb = new ArrayFile.Writer
        (new File(directory, FetcherText.DIR_NAME).toString(), FetcherText.class);
    //
    // Build the robots rules parser.  First, grab the agent names we
    // advertise to robots files.
    //
    String agentNames = NutchConf.get("http.robots.agents");
    StringTokenizer tok = new StringTokenizer(agentNames, ",");
    ArrayList agents = new ArrayList();
    while (tok.hasMoreTokens()) {
        agents.add(tok.nextToken().trim());
    }
    //
    // If there are no agents for robots-parsing, use our default
    // agent-string.  If both are present, our agent-string should be
    // the first one we advertise to robots-parsing.
    //
    if (agents.size() == 0) {
        agents.add(AGENT_NAME);
        LOG.severe("No agents listed in 'http.robots.agents' property!");
    } else if (!((String) agents.get(0)).equalsIgnoreCase(AGENT_NAME)) {
        agents.add(0, AGENT_NAME);
        LOG.severe("Agent we advertise (" + AGENT_NAME
                   + ") not listed first in 'http.robots.agents' property!");
    }
    // Turn into string array and construct the rule parser.
    this.robotRulesParser = new RobotRulesParser((String[]) agents.toArray(new String[agents.size()]));
    //
    // Load hostname bans.  Best effort: a missing or unreadable file
    // just disables the feature.
    //
    ArrayList bans = new ArrayList();
    LineNumberReader reader = null;
    try {
        reader = new LineNumberReader(NutchConf.getConfResourceAsReader(NutchConf.get("excludehosts.suffix.file")));
        ArrayList suffixStrings = new ArrayList();
        String line;
        while ((line = reader.readLine()) != null) {
            // Trim out comments and whitespace.
            int hashPos = line.indexOf("#");
            if (hashPos >= 0) {
                line = line.substring(0, hashPos);
            }
            line = line.trim();
            if (line.length() > 0) {
                suffixStrings.add(line.toLowerCase());
            }
        }
        bans.add(new SuffixStringMatcher(suffixStrings));
    } catch (Exception e) {
        LOG.warning("Not using hostNameSuffixBans: " + e.toString());
    } finally {
        // FIX: the reader was previously never closed (resource leak).
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ignored) {
                // nothing useful to do on close failure
            }
        }
    }
    if (bans.size() > 0) {
        this.hostnameBans = (TrieStringMatcher[]) bans.toArray(new TrieStringMatcher[bans.size()]);
    } else {
        this.hostnameBans = null;
    }
    // NOTE(review): timeout is always -1 at construction time (it is
    // set afterwards via setTimeout), so this is effectively dead code;
    // kept for safety.
    if (this.timeout != -1)
        this.http.setTimeout(this.timeout);
}
/**
 * Sets the number of FetcherThreads spawned by run().
 */
public void setThreadCount(int threadCount) {
this.threadCount=threadCount;
}
/**
 * Sets the HTTP/FTP socket timeout in milliseconds; -1 (the default)
 * leaves the protocol clients' own defaults.  Call before run().
 */
public void setTimeout(int timeout) {
this.timeout = timeout;
}
/**
 * Set delay between accesses to the same host, in milliseconds.
 */
public void setServerDelay(long serverDelay) {
this.serverDelay=serverDelay;
}
/**
 * Return the Http implementation.
 */
public Http getHttp() {
return http;
}
/**
 * Set the logging level on the fetcher and both protocol clients.
 */
public void setLogLevel(Level level) {
LOG.setLevel(level);
Http.LOG.setLevel(level);
Ftp.LOG.setLevel(level);
LOG.info("logging at " + level);
}
/**
 * Runs the fetcher: spawns threadCount FetcherThreads, polls once per
 * second until they all exit, closes the databases, and prints a final
 * status line.
 *
 * @throws RuntimeException if any thread logs a SEVERE error
 */
public void run() throws IOException, InterruptedException {
    start = System.currentTimeMillis();
    for (int i = 0; i < threadCount; i++) {      // spawn threads
        FetcherThread thread = new FetcherThread();
        if (this.timeout != -1)
            thread.setTimeout(this.timeout);
        thread.start();
    }
    try {
        do {
            Thread.sleep(1000);
            if (LogFormatter.hasLoggedSevere())
                throw new RuntimeException("SEVERE error logged.  Exiting fetcher.");
        } while (group.activeCount() > 0);       // wait for threads to finish
    } finally {
        // FIX: previously the databases were never closed when the
        // SEVERE-error RuntimeException was thrown; close them in a
        // finally so partial output is always flushed to disk.
        fetchList.close();
        fetcherDb.close();
        rawDb.close();
        strippedDb.close();
    }
    status();                                    // print final status
}
/**
 * Logs the fetcher run's totals plus derived rates.  Note the rates
 * are meaningless (Infinity/NaN) until at least one page has been
 * fetched and a second has elapsed.
 */
public synchronized void status() {
    long ms = System.currentTimeMillis() - start;
    float secs = ms / 1000.0f;
    LOG.info("status: "
             + pages + " pages, "
             + errors + " errors, "
             + bytes + " bytes, "
             + ms + " ms");
    LOG.info("status: "
             + (((float) pages) / secs) + " pages/s, "
             + (((float) bytes * 8 / 1024) / secs) + " kb/s, "
             + (((float) bytes) / pages) + " bytes/page");
}
/**
 * Command-line entry point: parses options, configures a Fetcher, and
 * runs it.
 *
 * Usage: Fetcher [-verbose] [-showThreadID] [-timeout N] [-threads M]
 *        [-delay O] dir
 */
public static void main(String[] args) throws Exception {
    int timeout = -1;                 // seconds on the command line, ms internally
    int threadCount = -1;
    long delay = -1;                  // seconds on the command line
    boolean verbose = false;
    boolean showThreadID = false;
    String directory = null;
    String usage = "Usage: Fetcher [-verbose] [-showThreadID] [-timeout N] [-threads M] [-delay O] dir";
    if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
    }
    for (int i = 0; i < args.length; i++) {          // parse command line
        if (args[i].equals("-timeout")) {
            // FIX: guard every option that consumes a value; a trailing
            // option previously threw ArrayIndexOutOfBoundsException.
            if (++i == args.length) {
                System.err.println(usage);
                System.exit(-1);
            }
            timeout = Integer.parseInt(args[i]) * 1000;
        } else if (args[i].equals("-threads")) {
            if (++i == args.length) {
                System.err.println(usage);
                System.exit(-1);
            }
            threadCount = Integer.parseInt(args[i]);
        } else if (args[i].equals("-delay")) {
            if (++i == args.length) {
                System.err.println(usage);
                System.exit(-1);
            }
            delay = Integer.parseInt(args[i]);
        } else if (args[i].equals("-verbose")) {
            verbose = true;
        } else if (args[i].equals("-showThreadID")) {
            showThreadID = true;
        } else if (i != args.length - 1) {           // unknown option
            System.err.println(usage);
            System.exit(-1);
        } else {                                     // last arg: required dir
            directory = args[i];
        }
    }
    // FIX: a command line consisting only of options used to pass
    // parsing and fail later with a NullPointerException.
    if (directory == null) {
        System.err.println(usage);
        System.exit(-1);
    }
    Fetcher fetcher = new Fetcher(directory);        // make a Fetcher
    if (timeout != -1)                               // set timeout option
        fetcher.setTimeout(timeout);
    if (threadCount != -1)                           // set threadCount option
        fetcher.setThreadCount(threadCount);
    if (delay != -1)                                 // set delay option
        fetcher.setServerDelay(delay * 1000);        // seconds -> milliseconds
    if (showThreadID)
        LogFormatter.setShowThreadIDs(showThreadID);
    // set log level
    fetcher.setLogLevel(verbose ? Level.FINE : Level.INFO);
    fetcher.run();                                   // run the Fetcher
}
}